#===============================================================================
#  File:    VAR.R
#  Date:    December, 2021 
#  Author:  Natalia Umansky
#  Purpose: Producing the Vector Autoregression Model (VAR) used in the Paper and reproducing Figure 5

# This script relies on the VAR analysis used by Barbará et al. (2019) and Gilardi et al. (2021)
#===============================================================================

# LIBRARIES
#===============================================================================

library(data.table)
library(tidyverse)
library(lubridate)
library(vars)
library(boot)
library(rio)
library(BBmisc)
library(forecast)

library(dplyr)
library(ggplot2)
library(stringr)
library(forcats)
library(showtext)
library(ggforce)

# IMPORT DATA
#===============================================================================

Groups <- c("Politicians", "Media", "Citizens", "Advocates", "Friends")

for (i in Groups){
  df <- fread(paste0("~/pred_",tolower(i),".csv"))
  assign(i, df)
}

# FILTERING SECURITIZING TWEETS THAT DISCUSS THE AMAZON RAINFOREST FIRES
#===============================================================================

ALL <- rbind(Politicians, Media, Advocates, Friends, Citizens)
ALL$text <- tolower(ALL$text)

amazon_dictionary <- "lungs of the planet|lungs of our planet|the amazon|lamazonie|amazonfire|actforamazon|amazon fires|rainforest|theamazon|savetheamazon|amazonrainforest|actfortheamazon|amazonforest|amazonian|amazonia|amazonasenllamas|mercosur|lamazzonia|amazonas|brazilian amazon|brazil amazon|rain forest|amazona|brazil amazon|prayforamazonia|prayforamazonas|forest fires|save the amazon"

remove_dictionary <-  "indonesia|uganda|westpapua|zambia|africa|thiopia|guyana|new york|sri lanka|mozambique|cambodia|srilanka|job alert|unicorn|donnabrazile|google|facebook|netflix|employees|corporate|corporation|amazonprime|primeday|amazonhq2|prime day|amazon prime|investigateamazon|fightfor15|amazonwearenotrobots|amazon25|google|amazonprimeday|tarifvertrag|parteiergreifen|ryanair|amazonstrike|boycottamazon|apple|bourdindirect|facebook|amazonprimeday|mindestlohn|zalando|otto|amazonlgbtroadshow|retail|tax|netflix|twitchtv|ecommerce|giletjaune|lerciostory|articolo|workers|jobs|monopol|new york|venezuela|maduro|donna|petebuttigiegs|nonnato|anticompetitive|data protection|privacy|alexa|delivery|surveillance|debt|amazonbooks|warehouses|technology|nyc|amazon deal|hq|youtube|jeff bozo|jeffbezos|bezos|taxes|robotic|mercadolibre|engineer|amazonhelp|hbo|tech|muller|amazon prime"

amazon <- ALL %>%
  filter(str_detect(text, amazon_dictionary))

amazon <- amazon[!str_detect(amazon$text, remove_dictionary),]

amazon$sec <- amazon$fit

#write.table(amazon, file= "amazon_data.csv", fileEncoding = "UTF-8")


# DATA PROCESSING
#===============================================================================

amazon <- fread("~/amazon_data.csv")

amazon <- amazon %>%
  filter(sec>0.2)%>%
  dplyr::select(date, sec, group) %>%
  mutate(date=as.Date(date)) %>%
  arrange(date)%>%
  group_by(date, group) %>%
  mutate(n=sum(sec)) %>%
  distinct(date, group, .keep_all=TRUE) %>%
  dplyr::select(date, group, n)

db <-as.data.frame.matrix(xtabs(n~date+group, amazon))
setDT(db, keep.rownames = "date")[]
colnames(db) <- c("date", "adv", "citizens", "fri", "pol", "media")

db <- as.data.frame(db)

db$selectsclass <- rep("Amazon_rainforest",length(db$date))

#===============================================================================
#=================================== VAR =======================================
#===============================================================================

#================ PART A: ESTIMATING VAR and CALCULATING IRFs ==================

variables <- c("adv", "citizens", "fri", "pol", "media")

db[,2:6] <- normalize(db[,2:6], method = "range", range = c(0, 1))
db[db == 0] <- NA

# - logit transform all series
for (v in variables) {
  # - pulling the series-agenda for that group
  x <- db[,v]
  # - for some groups the last couple observations for each issues are NA,
  #     making these a 0
  x[which(is.na(x))] <-0.01
  # - adding 1 percentage point to avoid 0s before the logit transformation
  #x <- x + 0.01
  # - applying the non-linear transformation
  logit_x <- log(x / (1-x))
  # If a topic makes up 100 % on a day transform the inf value to 4.59512
  logit_x[mapply(is.infinite, logit_x)] <- 4.59512
  db[,v] <- logit_x
}

# - creating a list with the variables of interest
variables_single <- c("adv", "citizens", "fri", "pol", "media")
variables_all <- c("adv", "citizens", "fri", "pol", "media", "selectsclass")

#========================= PART B: Model Maker =============================

# Find best model for all Topics:
##########################################################################################

p_for_each_topic <- data.frame("Amazon_rainforest",c(1),c(2),c(3),c(4))
colnames(p_for_each_topic) <- c("Topics", "Best P AIC", "Best P HQ", "Best P SC", "Best P FPE")

mod_conf <- list(c("Amazon_rainforest"))

for(f in 1:length(mod_conf)){
  top_mod <- unlist(mod_conf[f])
  finddb <- db %>% filter(selectsclass %in% top_mod)
  finddb$pubDateTime <- as.numeric(lubridate::ymd(as.character(finddb$date))) # corrected MK
  
  finddb$selectsclass <- as.factor(finddb$selectsclass)
  topic_levels <- levels(finddb$selectsclass)
  
  finddb$selectsclass <- as.numeric(finddb$selectsclass)
  mformula_f <- formula(paste0("~",
                               paste0(variables, collapse = " + ")))
  model_data_f <- model.matrix(mformula_f, finddb[, variables])
  model_data_f <- model_data_f[, 2:ncol(model_data_f)] # removing intercept
  
  #Dickey-Fuller Test to determine Stationary differences to achieve stationarity.
  stationarity <- ndiffs(model_data_f, alpha = 0.01, test = c("adf"))
  
  if(stationarity == 0){
    # - splitting the covariates of interest from the issue dummy variables
    X_endogenous_f <- model_data_f[, which(!grepl("selectsclass", colnames(model_data_f)))]
    X_exogenous_f <- model_data_f[, which(grepl("selectsclass", colnames(model_data_f)))]
    
    if(ncol(X_exogenous_f) == 0){
      var_model_find_lag <- VARselect(y = X_endogenous_f, lag.max = 18)
    } else {
      var_model_find_lag <- VARselect(y = X_endogenous_f, exogen = X_exogenous_f, lag.max = 18)
    }
    p_for_each_topic[f,2:5] <- var_model_find_lag$selection
  } else {
    cat("The Topic ", mod_conf[f], " is not stationary at the moment!\n")
    # Since this never triggers we do not need to transform the data!
  }
  
}

#check model specification: it tells us to use a lag order of 1. 
p_for_each_topic

# Calculate Models with lag = number of covariates
##########################################################################################

var_model_merged <- list()
var_irfs_cum_merged <- list()

mod_conf <- list(c("Amazon_rainforest"))

for(j in 1:length(mod_conf)){
  
  configuration_j <- mod_conf[[j]]
  if(j == 5){configuration_j <- unlist(configuration_j)}
  maindb <- db
  maindb$date <- as.numeric(lubridate::ymd(as.character(maindb$date))) # corrected MK
  
  # - a formula object that will facilitate transforming the topic variable from
  # - factor to dummies
  maindb$selectsclass <- as.factor(maindb$selectsclass)
  topic_levels <- levels(maindb$selectsclass)
  
  variables <- variables_single
  if(j == 5){variables <- variables_all}
  
  maindb$selectsclass <- as.character(maindb$selectsclass)
  mformula <- formula(paste0("~ ",
                             paste0(c(variables), collapse = " + ")))
  model_data <- model.matrix(mformula, maindb[, variables]) 
  model_data <- model_data[, 2:ncol(model_data)] # removing intercept
  
  # - splitting the covariates of interest from the issue dummy variables
  X_endogenous <- model_data[, which(!grepl("selectsclass", colnames(model_data)))]
  X_exogenous <- model_data[, which(grepl("selectsclass", colnames(model_data)))]
  
  if(length(X_exogenous) == 0){
    # - estimating the model: 1 lag order
    var_model_merged_tmp <- VAR(y = X_endogenous, p = 1) 
    
    var_irfs_cum_merged_tmp <- irf(var_model_merged_tmp, n.ahead = 60, cumulative = TRUE, runs = 1000)
  } else {
    # - estimating the model: 1 lag order
    var_model_merged_tmp <- VAR(y = X_endogenous, p = 1, exogen = X_exogenous)
    
    var_irfs_cum_merged_tmp <- irf(var_model_merged_tmp, n.ahead = 60, cumulative = TRUE, runs = 1000)
  }
  #Save different Models in List of Lists
  if(j == 1){
    var_model_merged <- list(var_model_merged_tmp)
    var_irfs_cum_merged <- list(var_irfs_cum_merged_tmp)
  } else {
    var_model_merged[[j]] <- var_model_merged_tmp
    var_irfs_cum_merged[[j]] <- var_irfs_cum_merged_tmp
    
  }
}

# - save models for later usage
saveRDS(var_model_merged, "~/var_model_all_small.RDS")
saveRDS(var_irfs_cum_merged, "~/var_irfs_main_all_small.RDS")

#========================= PART C: Testing: Calculating ONE-TIME and PERMANENT 10-POINT EFFECTS =============================

var_irfs <- var_irfs_cum_merged
irf_data <- list()

# Loop through all configurations
for(k in 1:length(var_irfs)){
  # Get Model K:
  var_irfs_k <- var_irfs[[k]]
  
  variables <- names(var_irfs_k$irf)
  
  # - a list with the elements of interest from the IRF object
  elements_to_pull <- c("irf", "Upper", "Lower")
  
  # - initializing and filling a dataset with the IRF info
  irf_data_tmp <- NULL
  for (el in elements_to_pull) {
    new_irf_info <- var_irfs_k[el][[1]]
    for (out in variables) {
      new_irf_var_data <- as.data.frame(new_irf_info[out][[1]])
      # - take inverse logit to transform the effects to percentage point changes
      new_irf_var_data_transf <- as.data.frame(
        sapply(1:ncol(new_irf_var_data), function(j)
          inv.logit(new_irf_var_data[,j]) - 0.5))
      colnames(new_irf_var_data_transf) <- colnames(new_irf_var_data)
      new_irf_var_data_long <- new_irf_var_data_transf %>%
        tidyr::gather(cov, value)
      new_irf_var_data_long$out <- out
      new_irf_var_data_long$day <- rep(1:nrow(new_irf_var_data), 
                                       length(unique(new_irf_var_data_long$cov)))
      new_irf_var_data_long$e_type <- el
      irf_data_tmp <- rbind(irf_data_tmp, new_irf_var_data_long)
    }
  }
  
  # - give easier labels to the estimate types (e.g. Lower --> lwr)
  irf_data_tmp$e_type <- recode(irf_data_tmp$e_type,
                                `irf` = "pe",
                                `Lower` = "lwr", 
                                `Upper` = "upr")
  if(k == 1){
    irf_data <- list(irf_data_tmp)
  } else {
    irf_data[[k]] <- irf_data_tmp
  }
  
}

# MAIN
##########################################################################################

for(l in 1:length(irf_data)){
  new_irf_data <- NULL
  # - a vector with the name of the variables
  variables <- unique(irf_data[[l]]$cov)
  
  # - deciding the number of days to simulate
  DAYS <- 60
  
  tmpirf <- irf_data[[l]]
  
  irf_data_tmp <- tmpirf %>%
    filter(day <= (DAYS + 1))
  
  # - iterating through covariates
  for (covariate in variables) {
    # -iterating through outcomes
    for (outcome in variables) {
      # - skipping when covariate and response are the same
      if (covariate != outcome) {
        # - initializing a cummulative-shocks matrix for this scenario: two 3-dim 
        # - matrix, one matrix for the covariate and one matrix for the response,
        # - and one dimension for the point estimate and the two other dimensions
        # - for the lower and upper bounds of the estimate
        cov_mat <- array(0, dim = c(DAYS, DAYS, 3))
        out_mat <- array(0, dim = c(DAYS, DAYS, 3))
        
        # - pull the full 15-day IRFs for the endogenous covariate
        cov_resp <- irf_data_tmp %>%
          filter(cov == covariate, out == covariate) %>%
          # - remove 1st row: it's part of the set-up (repsonse at day 0)
          filter(day != 1) %>%
          mutate(day = day -1)
        
        # - pull the full 15-day IRFs for the particular outcome variable
        out_resp <- irf_data_tmp %>%
          filter(cov == covariate, out == outcome) %>%
          # - remove 1st row: it's part of the set-up (repsonse at day 0)
          filter(day != 1) %>%
          mutate(day = day -1)
        
        # - transforming the 15-day IRFs for the covariate and outcome to a wide
        # - 3-column format (one column per estimate type: pe, lwr, upr)
        or_cov_resp <- cov_resp %>%
          dplyr::select(day, value, e_type) %>%
          tidyr::spread(e_type, value) %>%
          dplyr::select(-day)
        
        or_out_resp <- out_resp %>%
          dplyr::select(day, value, e_type) %>%
          tidyr::spread(e_type, value) %>%
          dplyr::select(-day)
        
        # - fill up the first rows of the scenario matrices with the original 
        # - 1-day shock responses
        cov_mat[1,,1:3] <- or_cov_resp %>%
          as.matrix()
        out_mat[1,,1:3] <- or_out_resp %>%
          as.matrix()
        
        for (i in 2:DAYS) {
          # - iterating through the rest of the 15 days, beyond the 1st one
          # - chekcing first how much attention the covariate group is predicted 
          # - to pay to the issue in day i-1
          cov_att_pe <- sum(cov_mat[,(i-1),2])
          
          # - calculating how big a new shock needs to be in order for the 
          # - covariate group to keep its attention to 100%
          cov_new_shock <- 1 - cov_att_pe
          
          # - re-scaling the original 100 percentage point shock to the new shock
          cov_new_resp <- or_cov_resp[1:(DAYS-(i-1)),] * cov_new_shock
          out_new_resp <- or_out_resp[1:(DAYS-(i-1)),] * cov_new_shock
          
          # - adding the response to this new shock to the scenario matrices
          cov_mat[i,i:DAYS,1:3] <- cov_new_resp %>%
            as.matrix()
          out_mat[i,i:DAYS,1:3] <- out_new_resp %>%
            as.matrix()
        }
        # - saving the output for this cov --> out 
        new_rows <- rbind(
          data.frame(
            cov = covariate,
            value = colSums(out_mat[,,1]),
            out = outcome,
            day = 1:DAYS,
            e_type = "lwr",
            data_type = "structural"),
          data.frame(
            cov = covariate,
            value = colSums(out_mat[,,2]),
            out = outcome,
            day = 1:DAYS,
            e_type = "pe",
            data_type = "structural"),
          data.frame(
            cov = covariate,
            value = colSums(out_mat[,,3]),
            out = outcome,
            day = 1:DAYS,
            e_type = "upr",
            data_type = "structural")
        )
        new_irf_data <- rbind(new_irf_data, new_rows)
      }
    }
  }
  
  # - merging this new type of IRFs with the "regular" 1-time-shock IRFs
  tmpirf$data_type <- "one_time_shock"
  tmpirf <- tmpirf %>%
    # - correct the original data for the fact that day 1 is just pre-setting
    filter(day != 1) %>% 
    mutate(day = day -1)
  
  all_irf_data <- rbind(tmpirf, new_irf_data)
  
  # - removing from the dataset cases in which covariate and outcome are the same
  all_irf_data <- all_irf_data %>%
    filter(cov != out)
  
  # - a wide version of the dataset, with a separate column for each estimate type
  all_irf_data_wide <- all_irf_data %>%
    tidyr::spread(e_type, value)
  
  # - simulate one-time and structural shocks of 10 instead of 100 percentage pts.
  # - Present the results in 0-100 scale instead of 0-1
  all_irf_data_wide <- all_irf_data %>%
    mutate(value = (value / 10) * 100) %>%
    tidyr::spread(e_type, value)
  
  # - humand readble labels for the covariates and outcomes
  all_irf_data_wide$cov <- recode(all_irf_data_wide$cov,
                                  `adv` = "Advocates",
                                  `fri` = "Politicians'\n Twitter Friends",
                                  `citizens` = "Citizens on\n Twitter",
                                  `media` = "Media",
                                  `pol` = "Politicians and\n Governmental Institutions")
  
  all_irf_data_wide$out <- recode(all_irf_data_wide$out,
                                  `adv` = "Advocates",
                                  `fri` = "Politicians'\n Twitter Friends",
                                  `citizens` = "Citizens on\n Twitter",
                                  `media` = "Media",
                                  `pol` = "Politicians and\n Governmental Institutions")
  
  # - reorder the levels of the outcome and covariate factor variables
  all_irf_data_wide$out <- factor(all_irf_data_wide$out,
                                  levels = c("Politicians and\n Governmental Institutions",
                                             "Media",
                                             "Advocates",
                                             "Politicians'\n Twitter Friends",
                                             "Citizens on\n Twitter"))
  
  all_irf_data_wide$cov <- factor(all_irf_data_wide$cov,
                                  levels = c("Politicians and\n Governmental Institutions",
                                             "Media",
                                             "Advocates",
                                             "Politicians'\n Twitter Friends",
                                             "Citizens on\n Twitter"))
  
  # - better labels for the data type
  all_irf_data_wide$data_type <- recode(
    all_irf_data_wide$data_type,
    `one_time_shock` =  "Effect of a one time 10 percentage point attention increase at day 0",
    `structural` = "Effect of a structural 10 percentage point attention increase at day 0")
  
  
  if(l == 1){
    all_irf_data_wide_all <- list(all_irf_data_wide)
  } else {
    all_irf_data_wide_all[[l]] <- all_irf_data_wide
  }
}

saveRDS(all_irf_data_wide_all,
        "~/onetime-structural-shock-irfs-results_all_small.RDS")
write.csv(all_irf_data_wide,
          "~/onetime-structural-shock-irfs-results_all.csv")

##########################################################################################

#===============================================================================
#======================== PART C: GENERATING FIGURE 5 ==========================
#===============================================================================

media_plot_db <- all_irf_data_wide %>%
  filter(day == 7) %>%
  # - looking only at one time 10 point effects
  filter(data_type == 'Effect of a one time 10 percentage point attention increase at day 0') %>%
  # - only exploring the MEDIA EFFECTS
  filter(cov == "Media") %>%
  mutate(data_type = "Effect of the Media on:") %>%
  rename(x = cov, y = out)


adv_plot_db <- all_irf_data_wide %>%
  filter(day == 7) %>%
  # - looking only at one time 10 point effects
  filter(data_type == 'Effect of a one time 10 percentage point attention increase at day 0') %>%
  # - only exploring the ADVOCATE EFFECTS
  filter(cov == "Advocates") %>%
  mutate(data_type = "Effect of Advocates on:") %>%
  rename(x = cov, y = out)


pol_plot_db <- all_irf_data_wide %>%
  filter(day == 7) %>%
  # - looking only at one time 10 point effects
  filter(data_type == 'Effect of a one time 10 percentage point attention increase at day 0') %>%
  # - only exploring the POLITICIANS EFFECTS
  filter(cov == "Politicians and\n Governmental Institutions") %>%
  mutate(data_type = "Effect of Politicians and Governmental Institutions on:") %>%
  rename(x = cov, y = out)


cit_plot_db <- all_irf_data_wide %>%
  filter(day == 7) %>%
  # - looking only at one time 10 point effects
  filter(data_type == 'Effect of a one time 10 percentage point attention increase at day 0') %>%
  # - only exploring the CITIZENS EFFECTS
  filter(cov == "Citizens on\n Twitter") %>%
  mutate(data_type = "Effect of Citizens on Twitter on:") %>%
  rename(x = cov, y = out)


all_plot_db <- rbind(pol_plot_db, cit_plot_db, media_plot_db, adv_plot_db)

pdf("~/Figure-5.pdf", width = 20, height = 15)
ggplot(all_plot_db,
       aes(x = y, y = pe, ymin = lwr, ymax = upr)) +
  geom_segment(aes(x = y, xend = y, y = lwr, yend = upr), 
               size = 4, alpha = 0.3, color= "blue") +
  geom_point(position = position_dodge(width = 0.55), size = 5)+
  geom_hline(yintercept = 0, color = "red", size= 3) +
  facet_wrap(~data_type, nrow = 2) +
  coord_flip() +
  geom_vline(xintercept = 15) +
  xlab("Audience") +
  ggtitle("Securitizing actor")+
  scale_y_continuous("\nThe 7-day effect of a one time 10 percentage point increase in attention in day 0",
                     expand = c(0,0.03))+
  theme(
    panel.spacing = unit(1.05, "lines"),
    title =element_text(size=16),
    legend.position = "bottom",
    panel.background = element_blank(),
    panel.grid.major = element_line(colour = "gray90", linetype = "solid"),
    axis.text = element_text(size = 16),
    axis.text.y = element_text(hjust=0),
    strip.text = element_text(size = 16),
    panel.border = element_rect(colour = "black", fill = FALSE),
    strip.background = element_rect(colour = "black"),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 16))
dev.off()